In-class Exercise 05

Loading Packages

pacman::p_load(tidyverse, jsonlite, SmartEDA, tidygraph, ggraph)

Importing Knowledge Graph Data

# Parse the knowledge-graph JSON export into an R list.
kg <- fromJSON(txt = "MC1_graph.json")

Inspect Structure

# Print only the top-level components of the parsed list.
str(object = kg, max.level = 1)
List of 5
 $ directed  : logi TRUE
 $ multigraph: logi TRUE
 $ graph     :List of 2
 $ nodes     :'data.frame': 17412 obs. of  10 variables:
 $ links     :'data.frame': 37857 obs. of  4 variables:

Extract and Inspect

# Pull the node and link tables out of the parsed list as tibbles.
nodes_tb1 <- kg[["nodes"]] %>% as_tibble()
edges_tb1 <- kg[["links"]] %>% as_tibble()

Initial EDA

# Horizontal bar chart: number of edges per edge type.
edges_tb1 %>%
  ggplot(mapping = aes(y = `Edge Type`)) +
  geom_bar()

# Horizontal bar chart: number of nodes per node type.
nodes_tb1 %>%
  ggplot(mapping = aes(y = `Node Type`)) +
  geom_bar()

Creating Knowledge Graph

This section builds a tidygraph graph object from the node and edge tables in four steps.

Step 1: Mapping from node id to row index

# Lookup table pairing each node id with its row position in nodes_tb1.
id_map <- nodes_tb1 %>%
  transmute(id, index = row_number())

This ensures each id from your node list is mapped to the correct row number.

Step 2 : Map source and target IDs to row indices

# Attach integer endpoints to every edge: `from` is the row index of the
# source node and `to` is the row index of the target node. Edges whose
# source/target id has no match in id_map get NA (left join).
edges_tb1 <- edges_tb1 %>%
  left_join(rename(id_map, from = index), by = c("source" = "id")) %>%
  left_join(rename(id_map, to = index), by = c("target" = "id"))

Step 3: Filter out any unmatched (invalid) edges

# Keep only edges whose source and target both resolved to a node row index.
edges_tb1 <- edges_tb1 %>%
  filter(!is.na(from), !is.na(to))

Step 4: Creating the graph

Lastly, tbl_graph() is used to create tidygraph’s graph object by using the code chunk below.

# Assemble the directed graph object from the node table and the edge table
# (edge endpoints are the integer `from`/`to` columns added earlier).
graph <- tbl_graph(nodes = nodes_tb1,
                   edges = edges_tb1,
                   directed = TRUE)

Visualising the knowledge graph

# Fix the RNG state so that subsequent random draws are reproducible.
set.seed(seed = 1234)

Visualising the whole Graph

# Plot the full graph with the "fr" layout: grey edges first, then nodes
# coloured by node type, then repelled name labels on top.
graph %>%
  ggraph(layout = "fr") +
  geom_edge_link(colour = "gray", alpha = 0.3) +
  geom_node_point(mapping = aes(color = `Node Type`), size = 4) +
  geom_node_text(mapping = aes(label = name), size = 2.5, repel = TRUE) +
  theme_void()

Step 1: Filter edges to only “MemberOf”

# Keep only the edges whose type is "MemberOf"; the node table is not
# filtered in this step.
graph_memberof <- filter(
  activate(graph, edges),
  `Edge Type` == "MemberOf"
)

Step 2: Extract only connected nodes (i.e. used in these edges)

# Collect the distinct node row indices that appear as either endpoint
# (`from` values first, then `to` values) of the remaining edges.
used_node_indices <- graph_memberof %>%
  activate(edges) %>%
  as_tibble() %>%
  with(unique(c(from, to)))

Step 3: Keep only those nodes

# Drop every node whose row position is not referenced by a remaining edge.
graph_memberof <- graph_memberof %>%
  activate(nodes) %>%
  filter(row_number() %in% used_node_indices)

Plot the sub-graph

# Plot the MemberOf sub-graph with the "fr" layout: grey edges, small nodes
# coloured by node type, and repelled name labels.
graph_memberof %>%
  ggraph(layout = "fr") +
  geom_edge_link(colour = "gray", alpha = 0.5) +
  geom_node_point(mapping = aes(color = `Node Type`), size = 1) +
  geom_node_text(mapping = aes(label = name), size = 2.5, repel = TRUE) +
  theme_void()
Warning: ggrepel: 789 unlabeled data points (too many overlaps). Consider
increasing max.overlaps